# Load the processed unemployment dataset.
# NOTE(review): absolute Windows path — consider a relative path or
# here::here() so the script runs on other machines.
model_data = read.csv("D:/UCSB/Spring_2022/PSTAT 131/PSTAT_131_HW/HW2/PSTAT-131/Final Project/data/processed_data.csv")

# Monthly time series starting Feb 1990; drop the first (date) column.
model_ts = ts(model_data[,-1], frequency = 12, start = c(1990, 2))
unemp.ts = model_ts[, "unemploy_rate_la"]

# Pre-COVID subset: observations through Dec 2019.
unemp.ts_nocovid = window(unemp.ts, end = c(2019, 12))
plot.ts(unemp.ts, main = "Unemployment Rate in LA", ylab = "Percentage")

# Lag-1 scatterplot: each observation against its predecessor.
# Use the actual series length rather than the hard-coded 252 so the
# code keeps working if the data are extended.
n_obs = length(unemp.ts)
plot(unemp.ts[-n_obs], unemp.ts[-1], main = "Scatterplot (lag=1)")
abline(lm(unemp.ts[-1] ~ unemp.ts[-n_obs]), col = 4)

Subset the data into training and testing sets.

# Hold out Jan 2018 onward for testing; training ends Dec 2017 so the
# two sets share no observations. (The original used end = 2018 /
# start = 2018, which put Jan 2018 in BOTH train and test.)
# NOTE(review): variable names keep the original "umemrate" spelling
# for compatibility with any downstream code.
umemrate_test = window(model_ts, start = 2018)
umemrate_train = window(model_ts, end = c(2017, 12))

###time-step features and lag features

Predictive Modeling

Our goal is to use previous data to predict the future value of the unemployment rate

ARIMA Model — the Autoregressive Model

The AutoRegressive (AR) model is one of the most popular time series models. In this model, each value is regressed on its previous observations. AR(1) is the first-order autoregression, meaning that the current value is based on the immediately preceding value.

Time series with trends or with seasonality — such as the unemployment rate — are not stationary: the trend and the seasonality affect the value of the series at different times.

One way to make a non-stationary time series stationary is to compute the differences between consecutive observations. This is known as differencing. Differencing helps stabilise the mean of a time series by removing changes in its level, and therefore eliminates (or reduces) trend and seasonality.

# Ljung-Box test on the first-differenced pre-COVID series: a small
# p-value means the differenced series still shows autocorrelation
# (it is not white noise), so an ARMA structure is worth modeling.
Box.test(diff(unemp.ts_nocovid), lag=10, type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  diff(unemp.ts_nocovid)
## X-squared = 201.88, df = 10, p-value < 2.2e-16
# Sample autocorrelation function of the pre-COVID series.
# NOTE(review): the Ljung-Box test above uses diff(); the ACF/PACF here
# use the undifferenced level series — confirm which scale is intended.
acf(unemp.ts_nocovid)

# Partial autocorrelation function — useful for choosing the AR order p.
pacf(unemp.ts_nocovid)

Seasonal Differencing

# Commented-out exploration: log and seasonal-difference transforms of
# the series, plus unit-root / differencing-order checks.
#cbind("Percentage%" =unemp.ts,
      #"Monthly log" = log(unemp.ts),
      #"Annual change in log" = diff(log(unemp.ts),12)) %>%
  #autoplot(facets=TRUE) +
    #xlab("Year") + ylab("") +
    #ggtitle("LA Unemployment Rate")
#unemp.ts %>% diff() %>% ur.kpss() %>% summary()
#ndiffs(unemp.ts_nocovid)
#nsdiffs(unemp.ts_nocovid)
# Simulate an AR(1) process with coefficient 0.5
AR_1 <- arima.sim(model = list(ar = 0.5), n = 200)
# Simulate an AR(1) process with coefficient 0.9
# (comment previously said 0.8, but the code uses ar = 0.9)
#
AR_2 <- arima.sim(model = list(ar = 0.9), n = 200)
plot.ts(cbind(AR_1 , AR_2), main="AR Model Simulated Data", col = "blue")

# Lag-1 autocorrelation: correlation between the series and itself
# shifted by one month. Uses length(unemp.ts) instead of the
# hard-coded index 252 so it generalizes to any series length.
cor(unemp.ts[-length(unemp.ts)], unemp.ts[-1])
## [1] 0.992912

The ACF function computes (and by default plots) estimates of the autocovariance or autocorrelation function for different time lags.

The x-axis denotes the time lag, while the y-axis displays the estimated autocorrelation. Looking at this data, we can say that each observation is positively related to its recent past observations. However, the correlation decreases as the lag increases.

White Noise

A white noise process is a random process of random variables that are uncorrelated, have mean zero, and have finite variance. In other words, a series is called white noise if it is purely random in nature. Random noise is denoted by εt.

Plots of white noise series exhibit a very erratic, jumpy, unpredictable behavior. Since the εt are uncorrelated, previous values do not help us to forecast future values. White noise series themselves are quite uninteresting from a forecasting standpoint (they are not linearly forecastable), but they form the building blocks for more general models.

Differencing can help stabilise the mean of a time series by removing changes in the level of a time series, and therefore eliminating (or reducing) trend and seasonality. As well as looking at the time plot of the data, the ACF plot is also useful for identifying non-stationary time series.

library(modeltime)
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.2.0 --
## v broom        0.8.0     v recipes      0.2.0
## v dials        0.1.1     v rsample      0.1.1
## v dplyr        1.0.8     v tibble       3.1.6
## v ggplot2      3.3.6     v tidyr        1.2.0
## v infer        1.0.0     v tune         0.2.0
## v modeldata    0.1.1     v workflows    0.2.6
## v parsnip      0.2.1     v workflowsets 0.2.1
## v purrr        0.3.4     v yardstick    0.0.9
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x purrr::discard() masks scales::discard()
## x dplyr::filter()  masks stats::filter()
## x dplyr::lag()     masks stats::lag()
## x recipes::step()  masks stats::step()
## * Dig deeper into tidy modeling with R at https://www.tmwr.org
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v readr   2.1.2     v forcats 0.5.1
## v stringr 1.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x readr::col_factor() masks scales::col_factor()
## x purrr::discard()    masks scales::discard()
## x dplyr::filter()     masks stats::filter()
## x stringr::fixed()    masks recipes::fixed()
## x dplyr::lag()        masks stats::lag()
## x readr::spec()       masks yardstick::spec()
library(timetk)
library(lubridate)
## 
## 载入程辑包:'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
# Build the modeling data frame: date column plus the LA unemployment
# rate, with DATE parsed into a proper Date. Call the as.Date() generic
# rather than the S3 method as.Date.character directly; the original
# mutate also passed DATE as a redundant first argument.
data = model_data %>%
  select(DATE, unemploy_rate_la) %>%
  mutate(DATE = as.Date(DATE))

# Keep only the first 320 monthly observations.
# NOTE(review): magic cutoff — 320 rows from Feb 1990 ends well before
# 2020; confirm this is the intended (pre-COVID) boundary, since the
# earlier ts analysis used Dec 2019 as its cutoff.
data = data[1:320, ]

# Interactive time-series plot of the target variable.
data %>% plot_time_series(DATE, unemploy_rate_la)
# Chronological 90/10 split — no shuffling, so the test set is the
# most recent stretch of the series.
splits <- initial_time_split(data, prop = 0.9)

# Auto ARIMA: the auto_arima engine searches the (p,d,q)(P,D,Q) orders.
arima_spec <- arima_reg() %>%
    set_engine(engine = "auto_arima")
model_fit_arima_no_boost <- arima_spec %>%
    fit(unemploy_rate_la ~ DATE, data = training(splits))
## frequency = 12 observations per 1 year
## Warning in FUN(X[[i]], ...): strings not representable in native encoding will
## be translated to UTF-8
## Warning in FUN(X[[i]], ...): unable to translate '<U+00C4>' to native encoding
## Warning in FUN(X[[i]], ...): unable to translate '<U+00D6>' to native encoding
## Warning in FUN(X[[i]], ...): unable to translate '<U+00E4>' to native encoding
## Warning in FUN(X[[i]], ...): unable to translate '<U+00F6>' to native encoding
## Warning in FUN(X[[i]], ...): unable to translate '<U+00DF>' to native encoding
## Warning in FUN(X[[i]], ...): unable to translate '<U+00C6>' to native encoding
## Warning in FUN(X[[i]], ...): unable to translate '<U+00E6>' to native encoding
## Warning in FUN(X[[i]], ...): unable to translate '<U+00D8>' to native encoding
## Warning in FUN(X[[i]], ...): unable to translate '<U+00F8>' to native encoding
## Warning in FUN(X[[i]], ...): unable to translate '<U+00C5>' to native encoding
## Warning in FUN(X[[i]], ...): unable to translate '<U+00E5>' to native encoding
# Boosted Auto ARIMA: ARIMA captures the base dynamics and an XGBoost
# model is fit to the residuals using the extra regressors below.
model_fit_arima_boosted <- arima_boost(
    min_n = 2,           # xgboost: minimum observations per terminal node
    learn_rate = 0.015   # xgboost: shrinkage rate
) %>%
    set_engine(engine = "auto_arima_xgboost") %>%
    # DATE feeds the ARIMA part; the numeric trend and (unordered)
    # month factor feed the xgboost part.
    fit(unemploy_rate_la ~ DATE + as.numeric(DATE) + factor(month(DATE, label = TRUE), ordered = F),
        data = training(splits))
## frequency = 12 observations per 1 year
# Exponential smoothing (ETS); the engine picks the error/trend/season
# components automatically.
ets_spec <- exp_smoothing() %>%
    set_engine(engine = "ets")
model_fit_ets <- ets_spec %>%
    fit(unemploy_rate_la ~ DATE, data = training(splits))
## frequency = 12 observations per 1 year
# Prophet: additive trend + seasonality model on the date regressor.
prophet_spec <- prophet_reg() %>%
    set_engine(engine = "prophet")
model_fit_prophet <- prophet_spec %>%
    fit(unemploy_rate_la ~ DATE, data = training(splits))
## Disabling weekly seasonality. Run prophet with weekly.seasonality=TRUE to override this.
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# Linear regression baseline: numeric time trend plus unordered
# month-of-year dummies for seasonality.
lm_spec <- linear_reg() %>%
    set_engine("lm")
model_fit_lm <- lm_spec %>%
    fit(unemploy_rate_la ~ as.numeric(DATE) + factor(month(DATE, label = TRUE), ordered = FALSE),
        data = training(splits))
library(earth)
## 载入需要的程辑包:Formula
## 载入需要的程辑包:plotmo
## 载入需要的程辑包:plotrix
## 
## 载入程辑包:'plotrix'
## The following object is masked from 'package:scales':
## 
##     rescale
## 载入需要的程辑包:TeachingDemos
# Multivariate Adaptive Regression Spline (MARS) model via the earth engine.
model_spec_mars <- mars(mode = "regression") %>%
    set_engine("earth") 

# Preprocessing recipe. Step order matters: DATE is consumed by
# step_date and step_mutate, then removed before the model is fit.
recipe_spec <- recipe(unemploy_rate_la ~ DATE, data = training(splits)) %>%
    step_date(DATE, features = "month", ordinal = FALSE) %>%  # month-of-year predictor
    step_mutate(date_num = as.numeric(DATE)) %>%              # numeric trend term
    step_normalize(date_num) %>%                              # center/scale the trend
    step_rm(DATE)                                             # drop the raw date column
  
# Bundle the recipe with the MARS spec and fit on the training split.
wflw_fit_mars <- workflow() %>%
    add_recipe(recipe_spec) %>%
    add_model(model_spec_mars) %>%
    fit(training(splits))

Add fitted models to a Model Table

# Collect all six fitted models into one modeltime table so they can be
# calibrated, forecast, and scored side by side. Argument order fixes
# each model's .model_id in the outputs below.
models_tbl <- modeltime_table(
    model_fit_arima_no_boost,
    model_fit_arima_boosted,
    model_fit_ets,
    model_fit_prophet,
    model_fit_lm,
    wflw_fit_mars
)

models_tbl
## # Modeltime Table
## # A tibble: 6 x 3
##   .model_id .model     .model_desc                              
##       <int> <list>     <chr>                                    
## 1         1 <fit[+]>   ARIMA(4,0,1)(2,1,1)[12]                  
## 2         2 <fit[+]>   ARIMA(2,0,2)(1,1,1)[12] W/ XGBOOST ERRORS
## 3         3 <fit[+]>   ETS(A,AD,A)                              
## 4         4 <fit[+]>   PROPHET                                  
## 5         5 <fit[+]>   LM                                       
## 6         6 <workflow> EARTH

Calibrate the model to a testing set

# Calibration computes out-of-sample residuals on the held-out test
# split; these back the forecast intervals and accuracy metrics below.
calibration_tbl <- models_tbl %>%
    modeltime_calibrate(new_data = testing(splits))
calibration_tbl
## # Modeltime Table
## # A tibble: 6 x 5
##   .model_id .model     .model_desc                        .type .calibration_da~
##       <int> <list>     <chr>                              <chr> <list>          
## 1         1 <fit[+]>   ARIMA(4,0,1)(2,1,1)[12]            Test  <tibble>        
## 2         2 <fit[+]>   ARIMA(2,0,2)(1,1,1)[12] W/ XGBOOS~ Test  <tibble>        
## 3         3 <fit[+]>   ETS(A,AD,A)                        Test  <tibble>        
## 4         4 <fit[+]>   PROPHET                            Test  <tibble>        
## 5         5 <fit[+]>   LM                                 Test  <tibble>        
## 6         6 <workflow> EARTH                              Test  <tibble>
# Forecast each model over the test period and overlay the forecasts
# on the full actual series.
calibration_tbl %>%
    modeltime_forecast(
        new_data    = testing(splits),
        actual_data = data
    ) %>%
    plot_modeltime_forecast(
      .legend_max_width = 25
    )
# Static (non-interactive) table of test-set accuracy metrics:
# MAE, MAPE, MASE, sMAPE, RMSE, and R-squared per model.
calibration_tbl %>%
    modeltime_accuracy() %>%
    table_modeltime_accuracy(
        .interactive = FALSE
    )
Accuracy Table
.model_id .model_desc .type mae mape mase smape rmse rsq
1 ARIMA(4,0,1)(2,1,1)[12] Test 0.73 11.89 2.19 11.09 0.85 0.64
2 ARIMA(2,0,2)(1,1,1)[12] W/ XGBOOST ERRORS Test 0.72 12.68 2.18 11.22 1.01 0.67
3 ETS(A,AD,A) Test 0.60 10.02 1.82 9.35 0.74 0.85
4 PROPHET Test 6.37 100.78 19.17 64.20 6.60 0.52
5 LM Test 2.35 38.82 7.07 30.58 2.67 0.00
6 EARTH Test 0.64 10.57 1.94 9.80 0.75 0.94

Random Forest

Deep Learning